What is your opinion on the Python vs R debate? To what extent do you agree with the post at https://www.dataschool.io/python-or-r-for-data-science/? Be honest; you won’t be penalized or rewarded for stating your opinions — only for the quality of your arguments.
What is your exploratory data analysis workflow? Suppose you are given a data set and a research question. Where do you start? How do you proceed? For instance, suppose you are given the task of distributing funds from donations to public welfare projects in a wide range of subjects (e.g. education, gender equality, poverty, job creation, healthcare) with the objective of maximum positive impact on society in general. Assume you have almost all the data you require. How do you measure impact? How do you form performance measures? What makes you think you have found an interesting angle?
If you had to plot a single graph using the flights data, what would it be? Why? Make your argument, actually code the plot, and provide the output. (You can find detailed info about the flights data set in its help file. Use ?flights after you load the nycflights13 package.)
# Packages for the flights plot: dplyr (data manipulation), nycflights13
# (provides the `flights` table), ggplot2 (plotting).
# Lines beginning with "##" are console output captured when the document
# was knitted, not live code.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nycflights13)
library(ggplot2)
# Quick structural overview of the data: 336,776 flights x 19 variables.
glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
# Departure delay vs. time in the air for every 2013 NYC flight.
# Fixed: aes() previously referenced columns as flights$dep_delay etc.
# Referencing the data frame directly inside aes() bypasses ggplot2's data
# masking (it breaks faceting and produces "flights$..." legend/axis labels);
# bare column names are the correct form.
# NOTE(review): colour = flight maps an arbitrary flight number onto a
# continuous gradient, which carries little meaning — consider `carrier`.
ggplot(flights, aes(dep_delay, air_time, colour = flight)) +
  geom_point()
## Warning: Removed 9430 rows containing missing values (geom_point).
# Required packages for the trade-data analysis below.
# Lines beginning with "##" are console output captured when knitted.
library("tidyverse")
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("readxl") # reads the sector lookup table (.xls) downloaded below
library("ggplot2") # already attached via tidyverse; harmless re-attach
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("gapminder")
library("dplyr") # already attached via tidyverse; harmless re-attach
# Helper: download an .rds file from the project repository into a temporary
# file, read it, and clean the temp file up even if reading fails.
# Replaces seven near-identical copy-pasted download blocks.
download_rds <- function(url) {
  tmp <- tempfile(fileext = ".rds")
  on.exit(file.remove(tmp), add = TRUE)
  download.file(url, destfile = tmp, mode = "wb")
  read_rds(tmp)
}

# Monthly import/export data prepared by the project team.
# Fixed: the imp_data/exp_data URLs previously contained a duplicated
# "?raw=true?raw=true" query string.
imp_data_final <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/imp_data_final.rds?raw=true")
exp_data_final <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/exp_data_final.rds?raw=true")
imp_data <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/imp_data.rds?raw=true")
exp_data <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/exp_data.rds?raw=true")
producer_inf <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/Producer_Inflation.rds?raw=true")

# Sector lookup table (Excel): maps sub-sector codes to sector names,
# joined onto the transaction data further below.
tmp <- tempfile(fileext = ".xls")
download.file("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Excel/export_import_sectors.xls?raw=true",
              destfile = tmp, mode = "wb")
sectors <- read_excel(tmp)
file.remove(tmp)

# Monthly US dollar exchange rate series.
usd_rate <- download_rds("https://github.com/MEF-BDA503/gpj18-r_coders/blob/master/Data_Sources_Rds/US_Dollar_Montly_Rate.rds?raw=true")
# Harmonize column names across the four tables before joining.
# Silent no-op if the column is absent (same semantics as names()<- matching).
rename_col <- function(df, from, to) {
  names(df)[names(df) == from] <- to
  df
}
exp_data_final <- rename_col(exp_data_final, "Date", "Export_Date")
exp_data <- rename_col(exp_data, "Date", "Export_Date")
imp_data_final <- rename_col(imp_data_final, "Date", "Import_Date")
# The source file labels the import total with an "Export_" prefix; fix it.
imp_data_final <- rename_col(imp_data_final, "Export_Total_Amount", "Import_Total_Amount")
imp_data <- rename_col(imp_data, "Date", "Import_Date")
library("dplyr")
# Attach sector metadata to each transaction via the sub-sector code.
exp_data <- exp_data %>%
  inner_join(sectors, by = c("Sector_Type_Code" = "Sub_Sector_Type_Code"))
imp_data <- imp_data %>%
  inner_join(sectors, by = c("Sector_Type_Code" = "Sub_Sector_Type_Code"))
# Derive a numeric year and a "YYYY-MM" period column for grouping.
exp_data <- exp_data %>%
  mutate(Export_Year = as.numeric(format(Export_Date, "%Y")),
         Export_Year_Month = format(Export_Date, "%Y-%m"))
exp_data_final <- exp_data_final %>%
  mutate(Export_Year = as.numeric(format(Export_Date, "%Y")),
         Export_Year_Month = format(Export_Date, "%Y-%m"))
imp_data <- imp_data %>%
  mutate(Import_Year = as.numeric(format(Import_Date, "%Y")),
         Import_Year_Month = format(Import_Date, "%Y-%m"))
imp_data_final <- imp_data_final %>%
  mutate(Import_Year = as.numeric(format(Import_Date, "%Y")),
         Import_Year_Month = format(Import_Date, "%Y-%m"))
# Keep only the columns used in the analysis and give them unambiguous names.
# After the join, sectors' own code column arrived as Sector_Type_Code.y;
# the transaction-level code becomes Sub_Sector_Type_Code.
imp_data <- imp_data %>%
  select(Import_Date, Sector_Type_Code, Sector_Type_Code.y, Main_Sector_Flag,
         Sector_Name_Eng, Amount, Import_Year, Import_Year_Month) %>%
  rename(Import_Amount = Amount,
         Sub_Sector_Type_Code = Sector_Type_Code,
         Sector_Type_Code = Sector_Type_Code.y)
exp_data <- exp_data %>%
  select(Export_Date, Sector_Type_Code, Sector_Type_Code.y, Main_Sector_Flag,
         Sector_Name_Eng, Amount, Export_Year, Export_Year_Month) %>%
  rename(Export_Amount = Amount,
         Sub_Sector_Type_Code = Sector_Type_Code,
         Sector_Type_Code = Sector_Type_Code.y)
# Treat missing amounts as zero so later sums do not propagate NA.
imp_data <- imp_data %>%
  mutate(Import_Amount = coalesce(Import_Amount, 0))
imp_data_final <- imp_data_final %>%
  mutate(Import_Total_Amount = coalesce(Import_Total_Amount, 0))
exp_data <- exp_data %>%
  mutate(Export_Amount = coalesce(Export_Amount, 0))
exp_data_final <- exp_data_final %>%
  mutate(Export_Total_Amount = coalesce(Export_Total_Amount, 0))
# Keep only observations before November 2018 (same cutoff in every table).
exp_data_final <- exp_data_final %>% filter(Export_Date < '2018-11-01')
exp_data <- exp_data %>% filter(Export_Date < '2018-11-01')
imp_data_final <- imp_data_final %>% filter(Import_Date < '2018-11-01')
imp_data <- imp_data %>% filter(Import_Date < '2018-11-01')
# Cache the cleaned tables for reuse in later sessions.
saveRDS(imp_data, file = "imp_data_v2.rds")
saveRDS(imp_data_final, file = "imp_data_final_v2.rds")
saveRDS(exp_data, file = "exp_data_v2.rds")
saveRDS(exp_data_final, file = "exp_data_final_v2.rds")
# Share of each main sector in total exports, expressed as a z-score so
# above/below-average sectors stand out in a diverging dot plot.
# NOTE: the Export_Date filter is redundant after the cutoff applied above,
# but is kept so this chunk also works on unfiltered data.
exp_share_sectors <- exp_data %>%
  filter(Main_Sector_Flag == 1 & Export_Date < '2018-11-01') %>%
  group_by(Sector_Name_Eng) %>%
  summarize(Export_Amount_Share = sum(Export_Amount)) %>%
  mutate(
    # Fraction of total exports, rounded to 4 decimals.
    Export_Amount_Share = round(Export_Amount_Share / sum(Export_Amount_Share), 4),
    # Standardized share; z-scores are computed from the rounded shares.
    share_z = round(
      (Export_Amount_Share - mean(Export_Amount_Share)) /
        sd(Export_Amount_Share), 2),
    above_or_below = ifelse(share_z < 0, "Below", "Above")
  ) %>%
  arrange(share_z) %>%
  # Freeze the sector order so ggplot draws sectors sorted by z-score.
  mutate(Sector_Name_Eng = factor(Sector_Name_Eng, levels = Sector_Name_Eng))
theme_set(theme_bw())
# Diverging dot plot of sector export-share z-scores.
ggplot(exp_share_sectors, aes(x = share_z, y = Sector_Name_Eng, label = share_z)) +
  xlab("Z") +
  ylab("Sector Name") +
  ggtitle("Export Sector Share") +
  geom_point(stat = 'identity', aes(col = above_or_below), size = 6)
# Manufacturing sub-sector exports in 2018, one point per month,
# coloured by year-month.
# Fixed: filter() and aes() previously referenced columns as
# exp_share_sectors1$... / exp_data$... — referencing the data frame directly
# bypasses dplyr/ggplot2 data masking (breaks faceting and produces
# "df$column" legend and axis labels); bare column names are the correct form.
exp_share_sectors1 <- exp_data %>%
  filter(grepl("Manufacture", Sector_Name_Eng) & Export_Year == 2018)
ggplot(exp_share_sectors1, aes(x = Sub_Sector_Type_Code,
                               y = Export_Amount,
                               label = Sub_Sector_Type_Code)) +
  xlab("Manufacturing Sub Sectors") +
  ylab("Total Export") +
  ggtitle("Manufacturing SubSector Based Export-2018") +
  geom_point(stat = 'identity', aes(col = Export_Year_Month), size = 6)
# Times Higher Education 2019 university-ranking data.
# Fixed: the URL previously contained a duplicated "?raw=true?raw=true"
# query string.
# NOTE(review): tmp is not removed here — presumably cleaned up just past
# this chunk, as the other download blocks do; verify downstream.
tmp <- tempfile(fileext = ".rds")
download.file("https://github.com/MEF-BDA503/pj18-efehandanisman/blob/master/timeshighereducation/ranking2019.rds?raw=true",
              destfile = tmp, mode = 'wb')
education_data <- read_rds(tmp)